---
output:
  html_document: default
  pdf_document: default
---
library(MASS)
library(tidyverse)
library(kableExtra)
library(latex2exp)
library(gplots)
library(pROC)
library(GGally)
library(knitr)
# Simulate a normal linear model with a fixed design:
# y1 = 1.6 + 0*x1 - 2*x2 + eps, eps ~ N(0, 3^2)
set.seed(1984)  # reproducibility
n <- 100
x1 <- rnorm(n, mean = 10, sd = sqrt(10))
x2 <- rnorm(n, mean = 5, sd = sqrt(5))
y1 <- 1.6 + 0 * x1 - 2 * x2 + rnorm(n, mean = 0, sd = 3)
simulatedata <- data.frame(x1, x2, y1)
Although we simulated x1 and x2, assume that they are deterministic quantities. (a) Write the equation of a normal linear model which has x1 and x2 as predictors, in terms of βs, σs, and ε along with the distributional assumptions of\(\epsilon\). What are the true parameter values of β0, β1, β2, and σ? Are the true parameter values fixed (non-random) or random quantities?
\[y=\beta_0+\beta_1 x_1+\beta_2 x_2+\epsilon \] where \(\epsilon \sim N(0,\sigma^2)\) independently, and the true values are \(\beta_0=1.6\), \(\beta_1=0\), \(\beta_2=-2\), \(\sigma=3\). The true parameter values are fixed (non-random) quantities.
simulation of alternative versions
# Two violations of the normal-model assumptions, using the same design:
# y2: Cauchy errors (heavy tails, no finite variance)
y2 <- 1.6 + 0 * x1 - 2 * x2 + rcauchy(n = n, location = 0, scale = 3)
# y3: normal errors whose standard deviation grows with the index
y3 <- 1.6 + 0 * x1 - 2 * x2 + rnorm(n = n, mean = 0, sd = (1:n) * 3)
# Compare the three response densities side by side
par(mfrow = c(1, 3))
plot(density(y1))
plot(density(y2))
plot(density(y3))
In \(y_2\) the errors follow a Cauchy distribution, which has no finite mean or variance. In \(y_3\) the errors are normal but their variance grows with the observation index, so it is not constant. Hence neither \(y_2\) nor \(y_3\) satisfies the assumptions of a normal linear model.
# Fit the normal linear model y1 ~ x1 + x2 on the simulated data
fit.simple <- lm(y1 ~ x1 + x2, data = simulatedata)
S <- summary(fit.simple)
S
##
## Call:
## lm(formula = y1 ~ x1 + x2, data = simulatedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8036 -1.9811 -0.3691 2.0812 8.2220
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.71237 1.16542 -1.469 0.145
## x1 0.20195 0.09703 2.081 0.040 *
## x2 -1.82409 0.13615 -13.397 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.862 on 97 degrees of freedom
## Multiple R-squared: 0.6501, Adjusted R-squared: 0.6429
## F-statistic: 90.12 on 2 and 97 DF, p-value: < 2.2e-16
# Estimated regression coefficients
fit.simple$coef
## (Intercept) x1 x2
## -1.7123651 0.2019501 -1.8240887
S$sigma # residual standard error: the estimate of sigma
## [1] 2.862193
# Coefficient of determination
R2=S$r.squared
R2
## [1] 0.6501148
# Adjusted R^2 (penalises additional predictors)
R2ad=S$adj.r.squared
R2ad
## [1] 0.6429007
S$sigma
## [1] 2.862193
\[\hat{y}=\hat\beta_0+\hat\beta_1 x_1+\hat\beta_2 x_2\] where \(\hat\beta_0=-1.71237\), \(\hat\beta_1=0.20195\), \(\hat\beta_2=-1.82409\), and the estimated error distribution is \(\epsilon \sim N(0,\hat\sigma^2)\) with \(\hat\sigma=2.862193\).
# p-value of the t-test of H0: beta1 = 0 (row 2 = coefficient of x1)
p_val=S$coefficients[2,4]
p_val # since p_value is significant and less than .05 we reject the null hypothesis
## [1] 0.04004259
# 95% confidence intervals for all coefficients
confint(fit.simple)
## 2.5 % 97.5 %
## (Intercept) -4.025395861 0.6006656
## x1 0.009368959 0.3945312
## x2 -2.094316008 -1.5538615
# 95% prediction interval for a new observation at x1 = 12, x2 = 7
newData <- data.frame(x1=12, x2=7)
modelPrediction <- predict(fit.simple, newData, interval = 'predict')
modelPrediction
## fit lwr upr
## 1 -12.05759 -17.80471 -6.310463
L <- 25000  # number of Monte Carlo replications
# One row per replication; columns hold the fitted estimates
df <- data.frame(iteration = seq_len(L), beta0 = 0, beta1 = 0, beta2 = 0, sigma = 0)
# Keep the fixed design (x1, x2) from the start of this document and
# redraw only the error term for each replication
for (i in seq_len(L)) {
  ySim <- 1.6 + 0 * x1 - 2 * x2 + rnorm(n = n, mean = 0, sd = 3)
  fitSim <- lm(ySim ~ x1 + x2)
  est <- coef(fitSim)
  df$beta0[i] <- est[[1]]
  df$beta1[i] <- est[[2]]
  df$beta2[i] <- est[[3]]
  df$sigma[i] <- summary(fitSim)$sigma
}
# Sampling distributions of the four estimates across the replications.
# Dashed red lines mark the Monte Carlo means of the three beta estimates
# (columns 2:4 of df); the sigma panel intentionally gets no reference line.
df %>%
gather(var, val, -iteration) %>%
ggplot(aes(x = val)) +
geom_density() +
facet_wrap(~var, scales = 'free') +
geom_vline(data = tibble(val = apply(df[, 2:4], 2, mean),
var = colnames(df[, 2:4])),
aes(xintercept = val), lty = 2, col = 'red')
We can see that the coefficient and sigma estimates are approximately normally distributed and centred near the true parameter values, as expected.
# Load the bike-sharing data (semicolon separated) and recode season as a factor
bikesharing<-read.csv('bikesharing.csv', sep=';')
bikesharing$season=factor(bikesharing$season, labels = c('spring','summer','fall','winter'))
is.factor(bikesharing$season)
## [1] TRUE
# Column-wise summaries for the numeric variables of interest
stats=bikesharing[,c('registered', 'temp', 'hum','windspeed')]
number.na <- apply(stats, 2, function(x) sum(is.na(x)))
means <- round(apply(stats, 2, mean),3)
stds <- round(apply(stats, 2, sd),3)
quantiles <- round(apply(stats, 2, quantile),3)
# One row per variable: NA count, mean, sd and the five quartile points
table.data.1 <- as.data.frame(t(as.matrix(as.data.frame(rbind(number.na, means, stds, quantiles)))))
colnames(table.data.1) <- c('NA values','Mean', 'Std','Minimum', '25% quantile', 'Median', '75% quantile', 'Maximum')
table.data.1 %>%
kbl() %>%
kable_styling(full_width = F)
| NA values | Mean | Std | Minimum | 25% quantile | Median | 75% quantile | Maximum | |
|---|---|---|---|---|---|---|---|---|
| registered | 0 | 3656.172 | 1560.256 | 20.000 | 2497.000 | 3662.000 | 4776.500 | 6946.000 |
| temp | 0 | 0.495 | 0.183 | 0.059 | 0.337 | 0.498 | 0.655 | 0.862 |
| hum | 0 | 0.628 | 0.142 | 0.000 | 0.520 | 0.627 | 0.730 | 0.973 |
| windspeed | 0 | 0.190 | 0.077 | 0.022 | 0.135 | 0.181 | 0.233 | 0.507 |
# Correlation heat map of all columns except 1 and 4
# (presumably the date/identifier and factor columns -- TODO confirm
#  that the remaining columns are all numeric, as cor() requires)
drop=c(1,4)
heatmap.2(cor(bikesharing[,-drop]))
# Distribution of registered users by season
ggplot(bikesharing, aes(x=season, y=registered)) +
geom_boxplot(fill="slateblue", alpha=0.2) +
xlab("season")
As expected, people use bikes more when the weather becomes warmer. In spring and summer the median number of registrations increases, while in fall and winter we see the opposite trend.
# One-way model: registered users explained by season only
fitseas=lm(registered~season, data = bikesharing)
S1=summary(fitseas)
S1
##
## Call:
## lm(formula = registered ~ season, data = bikesharing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4136.5 -1042.0 -181.1 1111.3 3576.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3528.1 100.9 34.961 < 2e-16 ***
## seasonsummer 919.6 142.7 6.444 2.12e-10 ***
## seasonfall 628.4 143.1 4.391 1.29e-05 ***
## seasonwinter -1049.5 143.3 -7.324 6.42e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1369 on 727 degrees of freedom
## Multiple R-squared: 0.2335, Adjusted R-squared: 0.2303
## F-statistic: 73.8 on 3 and 727 DF, p-value: < 2.2e-16
# Overall F statistic with its degrees of freedom
S1$fstatistic
## value numdf dendf
## 73.80183 3.00000 727.00000
# One-way ANOVA table for the season factor
anova(fitseas)
## Analysis of Variance Table
##
## Response: registered
## Df Sum Sq Mean Sq F value Pr(>F)
## season 3 414867209 138289070 73.802 < 2.2e-16 ***
## Residuals 727 1362244763 1873789
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The same ANOVA computed via aov()
summary(aov(fitseas))
## Df Sum Sq Mean Sq F value Pr(>F)
## season 3 4.149e+08 138289070 73.8 <2e-16 ***
## Residuals 727 1.362e+09 1873789
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As p_value is <2e-16 we can reject the null hypothesis.
# Tukey HSD: all pairwise season comparisons with family-wise 95% CIs
TukeyHSD(aov(fitseas))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = fitseas)
##
## $season
## diff lwr upr p adj
## summer-spring 919.6359 552.1499 1287.1219 0.0000000
## fall-spring 628.4351 259.9409 996.9293 0.0000762
## winter-spring -1049.5123 -1418.5179 -680.5067 0.0000000
## fall-summer -291.2008 -659.6950 77.2934 0.1761548
## winter-summer -1969.1482 -2338.1537 -1600.1426 0.0000000
## winter-fall -1677.9474 -2047.9570 -1307.9377 0.0000000
# Keep only the pairwise comparisons that are significant at the 5% level
TukeyHSD(aov(fitseas))$season %>%
as_tibble(rownames = 'Comparison') %>%
rename(p.adj = 5) %>%
filter(p.adj < 0.05) %>%
kbl(align = 'c') %>%
kable_styling()
| Comparison | diff | lwr | upr | p.adj |
|---|---|---|---|---|
| summer-spring | 919.6359 | 552.1499 | 1287.1219 | 0.00e+00 |
| fall-spring | 628.4351 | 259.9409 | 996.9293 | 7.62e-05 |
| winter-spring | -1049.5123 | -1418.5179 | -680.5067 | 0.00e+00 |
| winter-summer | -1969.1482 | -2338.1537 | -1600.1426 | 0.00e+00 |
| winter-fall | -1677.9474 | -2047.9570 | -1307.9377 | 0.00e+00 |
We see that the p-value for each comparison is less than 0.05 and thus there exists a significant difference between every group. The first line states that the difference between the summer and spring is 919.6359, with a 95% confidence interval of 552.1499 and 1287.1219; the difference is significant. similarly for other lines.
# Confidence intervals for the pairwise seasonal differences
plot(TukeyHSD(aov(fitseas)))
For each pair of seasons there is a significant difference, except possibly fall–summer.
# Extend the seasonal model with the weather covariates
fit2 <- lm(registered ~ season + temp + hum + windspeed, data = bikesharing)
Sfit2 <- summary(fit2)
Sfit2
##
## Call:
## lm(formula = registered ~ season + temp + hum + windspeed, data = bikesharing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4077.3 -942.2 -130.6 979.1 2812.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3452.35 334.46 10.322 < 2e-16 ***
## seasonsummer -547.61 166.30 -3.293 0.00104 **
## seasonfall 627.88 128.94 4.869 1.38e-06 ***
## seasonwinter -45.27 155.61 -0.291 0.77120
## temp 5517.23 456.05 12.098 < 2e-16 ***
## hum -2945.36 340.56 -8.649 < 2e-16 ***
## windspeed -3607.80 609.49 -5.919 4.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1208 on 724 degrees of freedom
## Multiple R-squared: 0.4057, Adjusted R-squared: 0.4008
## F-statistic: 82.38 on 6 and 724 DF, p-value: < 2.2e-16
# Residual standard error of the extended model
Sfit2$sigma
## [1] 1207.764
# R^2 of the extended model
Sfit2$r.squared
## [1] 0.4057242
The following dummy variables \(d_i\) respect to season (to avoid writing 4 equation) then \[ \hat{y}=\beta_0+\beta_1 d_{summer}+ \beta_2 d_{fall}+\beta_3 d_{winter}+\beta_4 x_1+\beta_5 x_2+\beta_6 x_3\]
# Compare the season-only model (1) against the weather-extended model (2)
# on number of coefficients, adjusted R^2, AIC and in-sample RMSE
fits <- list(fitseas, fit2)
summaries <- list(S1, Sfit2)
models <- c(1, 2)
R.adj <- vapply(summaries, function(s) s$adj.r.squared, numeric(1))
Nub.Predictors <- vapply(fits, function(f) length(f$coefficients), numeric(1))
aic <- vapply(fits, AIC, numeric(1))
RMSE <- vapply(fits,
               function(f) sqrt(mean((bikesharing$registered - predict(f))^2)),
               numeric(1))
table.data.3 <- as.data.frame(cbind(models, Nub.Predictors, R.adj, aic, RMSE))
colnames(table.data.3) <- c('Model','Number.Predictors' , '$R_{\\text{adj}}^2$', 'AIC', '$\\text{RMSE}$')
table.data.3 %>%
  kbl() %>%
  kable_styling(full_width = F)
| Model | Number.Predictors | \(R_{\text{adj}}^2\) | AIC | \(\text{RMSE}\) |
|---|---|---|---|---|
| 1 | 4 | 0.2302870 | 12638.66 | 1365.114 |
| 2 | 7 | 0.4007993 | 12458.58 | 1201.967 |
The model from part (f) (Model 2) has the lower AIC.
# Diagnostics for fit2: collect leverages, (standardised) residuals and
# jackknife (studentised) residuals in one data frame
fort.2 <- fortify(fit2)
fort.2$jackknife <- rstudent(fit2)
fort.2$rn <- row.names(fort.2)
n <- nrow(fort.2)  # note: overwrites the simulation n defined earlier
p <- ncol(model.matrix(fit2))  # number of model parameters
fort.2$obsN <- row.names(fort.2)
fort.2$index <- 1:nrow(fort.2)
# Index plot of leverages with the conventional 2p/n threshold
fort.2 %>%
ggplot(aes(x = index, y = .hat)) +
geom_point() +
geom_hline(yintercept = 2*p/n, lty = 2, col = 'red') +
geom_text(aes(label = ifelse(.hat > 2*p/n, obsN, '')), hjust = -0.5)
We see that observations 50 and 69 have the highest leverage, but there are other potential high-leverage points, such as observations 302, 45 and 65, among others; see the following table:
# All observations exceeding the 2p/n leverage threshold, highest first
fort.2 %>%
filter(.hat > 2*p/n) %>%
arrange(desc(.hat)) %>%
select(obsN, .hat) %>%
kbl(align = 'c') %>%
kable_styling()
| obsN | .hat |
|---|---|
| 50 | 0.0404726 |
| 69 | 0.0330918 |
| 302 | 0.0285446 |
| 239 | 0.0241487 |
| 65 | 0.0240762 |
| 45 | 0.0232324 |
| 90 | 0.0224346 |
| 627 | 0.0209779 |
| 694 | 0.0208881 |
| 293 | 0.0204762 |
| 668 | 0.0203174 |
| 62 | 0.0198301 |
# Index plots of standardised and jackknife residuals, with heuristic
# cut-offs at +/-2.5 (green) and +/-3 (red); points above 3 are labelled
fort.2 %>%
dplyr::select(.stdresid, jackknife, rn) %>%
gather(residual, value, -rn) %>%
mutate(residual = factor(residual,
levels = c('.stdresid', 'jackknife'))) %>%
ggplot(aes(x = rn, y = value)) +
geom_point(size = 1) +
geom_hline(yintercept = 0, lty = 2, col = 'red') +
geom_hline(yintercept = 2.5, lty = 2, col = 'green') +
geom_hline(yintercept = -2.5, lty = 2, col = 'green') +
geom_hline(yintercept = 3, lty = 2, col = 'red') +
geom_hline(yintercept = -3, lty = 2, col = 'red') +
geom_text(aes(label = ifelse(abs(value) > 3, rn, '')), vjust = -0.5) +
xlab("Index") +
ylab("Residuals") +
ggtitle("Index plot of standardized and jackknife residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15)) +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
facet_wrap(~residual, scales = 'free')
From the studentized and jackknife residual points we see that there are observations exceed the (heuristic) threshold of 2. Let’s confirm our graphical exploration by comparing our jackknife residual to the theoretical t value we obtain after Bonferroni corrections.
# Bonferroni outlier test: compare the |jackknife| residuals above 3 with
# the t quantile corrected for n simultaneous tests, and uncorrected
p = length(coef(fit2))
n = length(fitted(fit2))
fort.2$obsN <- row.names(fort.2)
alpha <- 0.05
# Bonferroni-corrected critical value (alpha split over n tests)
theoreticalT <- qt(p = 1 - alpha/(2 * n), n - p - 1)
theoreticalNoBon <- qt(p = 1 - alpha/2, n - p - 1)
tab <- tibble(jackknife = abs(fort.2$jackknife[abs(fort.2$jackknife) > 3]),
theoreticalT = theoreticalT ,
OutlierBon = jackknife > theoreticalT ,
theoreticalNoBon = theoreticalNoBon,
OutlierNoBon = jackknife > theoreticalNoBon) %>%
kbl(align = 'c') %>%
kable_styling(full_width = F)
tab
| jackknife | theoreticalT | OutlierBon | theoreticalNoBon | OutlierNoBon |
|---|---|---|---|---|
| 3.459055 | 4.005112 | FALSE | 1.963251 | TRUE |
# Cook's distances; 0.05 is an ad-hoc flagging threshold used below as well
fort.2 %>%
ggplot(aes(x = index, y = .cooksd)) +
geom_point() +
geom_hline(yintercept = 0.05, lty = 2) +
geom_text(aes(label = ifelse(.cooksd > 0.05, obsN, '')), hjust = -0.5)
# For each suspicious observation, flag whether it is a high-leverage point,
# a Bonferroni outlier, and/or influential by Cook's distance, then total up
suspicious <- c(50,69,302,239,45,65, 293,62,668,694,627)
fort.2 %>%
filter(obsN %in% suspicious) %>%
mutate(highLeverage = .hat > 2*p/n,
outlier = abs(jackknife) > theoreticalT,
influential = .cooksd > 0.05) %>%
select(obsN, highLeverage, outlier, influential) %>%
mutate(totalMarks = highLeverage + outlier + influential) %>%
kbl(align = 'c') %>%
kable_styling()
| obsN | highLeverage | outlier | influential | totalMarks |
|---|---|---|---|---|
| 45 | TRUE | FALSE | FALSE | 1 |
| 50 | TRUE | FALSE | FALSE | 1 |
| 62 | TRUE | FALSE | FALSE | 1 |
| 65 | TRUE | FALSE | FALSE | 1 |
| 69 | TRUE | FALSE | TRUE | 2 |
| 239 | TRUE | FALSE | FALSE | 1 |
| 293 | TRUE | FALSE | FALSE | 1 |
| 302 | TRUE | FALSE | FALSE | 1 |
| 627 | TRUE | FALSE | FALSE | 1 |
| 668 | TRUE | FALSE | FALSE | 1 |
| 694 | TRUE | FALSE | FALSE | 1 |
# Sensitivity check: refit the full model leaving out one suspicious
# observation at a time and record the resulting adjusted R^2.
# (Lengths are derived from `indexes` instead of being hard-coded to 11,
# so the list of suspicious points can be changed in one place.)
indexes <- c(50, 69, 302, 239, 45, 65, 293, 62, 668, 694, 627)
r.adj <- numeric(length(indexes))
for (i in seq_along(indexes)) {
  bikesharing.update <- bikesharing[-indexes[i], ]
  fit2.update <- lm(registered ~ season + temp + hum + windspeed, data = bikesharing.update)
  r.adj[i] <- summary(fit2.update)$adj.r.squared
}
table.data.2 <- as.data.frame(cbind(indexes, r.adj))
colnames(table.data.2) <- c('Index', '$R_{\\text{adj}}^2$')
table.data.2 %>%
  kbl() %>%
  kable_styling(full_width = F)
| Index | \(R_{\text{adj}}^2\) |
|---|---|
| 50 | 0.4012560 |
| 69 | 0.4073814 |
| 302 | 0.3982012 |
| 239 | 0.4003234 |
| 45 | 0.4006081 |
| 65 | 0.3979838 |
| 293 | 0.4008911 |
| 62 | 0.4001896 |
| 668 | 0.4002501 |
| 694 | 0.4005677 |
| 627 | 0.4007928 |
The computed \(R^2_{\text{adj}}\) of our first model is 0.4007. The results in the table, together with a closer examination of the models created in the loop, indicate that removing some of these points is indeed useful.
# Index plot of the raw residuals; points beyond +/-2750 are labelled
fort.2 %>%
dplyr::select(.resid, rn) %>%
gather(residual, value, -rn) %>%
mutate(residual = factor(residual,
levels = '.resid')) %>%
ggplot(aes(x = rn, y = value)) +
geom_point(size = 1) +
geom_hline(yintercept = 0, lty = 2, col = 'red') +
geom_hline(yintercept = 2750, lty = 2, col = 'red') +
geom_hline(yintercept = -2750, lty = 2, col = 'red') +
geom_text(aes(label = ifelse(abs(value) >2750, rn, '')), vjust = -0.5) +
xlab("Index") +
ylab("Residuals") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Index plot of residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
There seems to be an increasing trend in our residuals.
# QQ-plot of the standardised residuals against the identity line
tibble(fort.2$.stdresid) %>%
gather(type, val) %>%
ggplot(aes(sample = val)) +
stat_qq(size = 1) +
geom_abline(slope = 1, intercept = 0, lty = 2, col = 'blue') +
xlab("Theoretical value") +
ylab("Sample value") +
ggtitle("QQ-plot of the residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
We see evidence of non-normality. This is the effect of some points on this model.
# Residuals against fitted values, with a loess smoother to reveal
# any remaining trend or non-constant spread
tibble(yFitted = fort.2$.fitted, residual = fort.2$.resid, rn = fort.2$rn) %>%
ggplot(aes(x = yFitted, y = residual)) +
geom_point(size = 1) +
geom_hline(yintercept = 0, lty = 2, col = 'red') +
geom_smooth(method = 'loess') +
xlab("Fitted value") +
ylab("Residual") +
ggtitle("Residual vs. fitted value") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
(j) Based on the results from items (g), (h) and (i), suggest improvements to the model.
The variable seasonwinter is not significant, so we could consider a model that does not distinguish winter from the baseline season; this could help improve the model.
To summarize our findings: there are some deviations from the normality assumptions, there seems to be heteroskedasticity, and some points appear to be outliers and influential points. We can potentially improve our model by applying transformations.
We continue with the bikesharing.csv data and the model you fitted in the previous section.
# Refit the model from the previous section and find the Box-Cox lambda
# that maximises the profile log-likelihood
fit2=lm(registered~season+temp+hum+windspeed, data = bikesharing)
bc<-boxcox(fit2)
lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 0.7070707
# Two candidate transformations of the response: log and Box-Cox with lambda
bikesharing$log_registered<-log(bikesharing$registered)
bikesharing$T_lambda_registered<-((bikesharing$registered)^(lambda)-1)/lambda
# Compare the density of the response under no / log / Box-Cox transform
plot(density(bikesharing$registered))
plot(density(bikesharing$log_registered))
plot(density(bikesharing$T_lambda_registered))
It seems that both the Box–Cox transformation and no transformation are better than the log transformation.
I add the variables workingday, holiday and casual. The other candidate predictors are similar or show only small variation.
# Model with log-transformed response and three extra predictors.
# NOTE(review): `casual` is the same-day count of casual riders, i.e. a
# contemporaneous outcome rather than a pre-known covariate -- confirm it
# is legitimate to use it as a predictor here.
fitlog=lm(log_registered~season+temp+hum+windspeed+workingday+holiday+casual, data = bikesharing)
Sl=summary(fitlog)
Sl
##
## Call:
## lm(formula = log_registered ~ season + temp + hum + windspeed +
## workingday + holiday + casual, data = bikesharing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7004 -0.1925 0.0275 0.2418 0.8553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.351e+00 1.205e-01 61.012 < 2e-16 ***
## seasonsummer -1.097e-01 5.466e-02 -2.007 0.0451 *
## seasonfall 2.099e-01 4.143e-02 5.065 5.20e-07 ***
## seasonwinter 1.007e-01 5.136e-02 1.960 0.0504 .
## temp 1.099e+00 1.667e-01 6.594 8.29e-11 ***
## hum -7.173e-01 1.148e-01 -6.248 7.09e-10 ***
## windspeed -9.029e-01 2.014e-01 -4.484 8.54e-06 ***
## workingday 6.059e-01 4.427e-02 13.688 < 2e-16 ***
## holiday -7.905e-02 8.961e-02 -0.882 0.3780
## casual 4.084e-04 3.678e-05 11.104 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3864 on 721 degrees of freedom
## Multiple R-squared: 0.5387, Adjusted R-squared: 0.5329
## F-statistic: 93.54 on 9 and 721 DF, p-value: < 2.2e-16
# Residual standard error (on the log scale), adjusted R^2 and R^2
Sl$sigma
## [1] 0.3863846
Sl$adj.r.squared
## [1] 0.5329133
Sl$r.squared
## [1] 0.5386719
All variables except holiday and seasonwinter are significant.
# Compare the three models on adjusted R^2, AIC and RMSE.
# NOTE(review): model 3 has a log-transformed response, so its AIC is on a
# different scale and is not directly comparable to models 1 and 2 without
# a Jacobian correction; also exp(predict(fitlog)) back-transforms without
# a bias (smearing) correction -- confirm these comparisons are intended.
models <- c(1,2,3)
R.adj <- rep(0,3)
aic <- rep(0,3)
RMSE<- rep(0,3)
R.adj[1]=S1$adj.r.squared
R.adj[2]=Sfit2$adj.r.squared
R.adj[3]=Sl$adj.r.squared
aic[1]=AIC(fitseas)
aic[2]=AIC(fit2)
aic[3]=AIC(fitlog)
RMSE[1]=sqrt(mean((bikesharing$registered - predict(fitseas))^2))
RMSE[2]=sqrt(mean((bikesharing$registered - predict(fit2))^2))
# Back-transform the log-scale predictions to the original count scale
RMSE[3]=sqrt(mean((bikesharing$registered - exp(predict(fitlog)))^2))
N.predictors=rep(0,3)
N.predictors[1]=length(fitseas$coefficients)
N.predictors[2]=length(fit2$coefficients)
N.predictors[3]=length(fitlog$coefficients)
table.data.3 <- as.data.frame(cbind(models,N.predictors ,R.adj, aic, RMSE))
colnames(table.data.3) <- c('Model','Nub.var' ,'$R_{\\text{adj}}^2$', 'AIC', '$\\text{RMSE}$')
table.data.3 %>%
kbl() %>%
kable_styling(full_width = F)
| Model | Nub.var | \(R_{\text{adj}}^2\) | AIC | \(\text{RMSE}\) |
|---|---|---|---|---|
| 1 | 4 | 0.2302870 | 12638.656 | 1365.114 |
| 2 | 7 | 0.4007993 | 12458.576 | 1201.967 |
| 3 | 10 | 0.5329133 | 696.171 | 1056.917 |
The AIC and RMSE of model 3 are much smaller than those of the others, so model 3 appears better than models 1 and 2 (keeping in mind that AIC is not directly comparable across different response scales).
# Diagnostics for fitlog: standardised and jackknife residual index plots,
# labelling points with |residual| > 3
fort.3 <- fortify(fitlog)
fort.3$.jackknife <- rstudent(fitlog)
fort.3$rn <- row.names(fort.3)
fort.3 %>%
dplyr::select(.stdresid, .jackknife, rn) %>%
gather(residual, value, -rn) %>%
mutate(residual = factor(residual,
levels = c('.stdresid', '.jackknife'))) %>%
ggplot(aes(x = rn, y = value)) +
geom_point(size = 1) +
geom_hline(yintercept = 0, lty = 2, col = 'red') +
geom_hline(yintercept = 3, lty = 2, col = 'red') +
geom_hline(yintercept = -3, lty = 2, col = 'red') +
geom_text(aes(label = ifelse(abs(value) > 3, rn, '')), vjust = -0.5) +
xlab("Index") +
ylab("Residuals") +
ggtitle("Index plot of standardized and jackknife residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15)) +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
facet_wrap(~residual, scales = 'free')
# Bonferroni outlier test for fitlog, analogous to the one for fit2
p = length(coef(fitlog))
n = length(fitted(fitlog))
alpha <- 0.05
# Bonferroni-corrected and uncorrected t critical values
tCrit <- qt(p = 1 - alpha/(2 * n), n - p - 1)
tNoBon <- qt(p = 1 - alpha/2, n - p - 1)
tab <- tibble(jackknife = abs(fort.3$.jackknife[abs(fort.3$.jackknife) > 3]),
tCrit = tCrit,
OutlierBon = jackknife > tCrit,
tNoBon = tNoBon,
OutlierNoBon = jackknife > tNoBon) %>%
kbl(align = 'c') %>%
kable_styling(full_width = F)
tab
| jackknife | tCrit | OutlierBon | tNoBon | OutlierNoBon |
|---|---|---|---|---|
| 3.340542 | 4.00521 | FALSE | 1.963264 | TRUE |
| 4.347921 | 4.00521 | TRUE | 1.963264 | TRUE |
| 4.860311 | 4.00521 | TRUE | 1.963264 | TRUE |
| 13.821477 | 4.00521 | TRUE | 1.963264 | TRUE |
| 3.643416 | 4.00521 | FALSE | 1.963264 | TRUE |
# List the observations whose jackknife residual exceeds the
# Bonferroni-corrected critical value
theoreticalT <- qt(p = 1 - 0.05/(2 * n), df = n - p - 1)
fort.3$obsN <- row.names(fort.3)
fort.3 %>%
mutate(theoretical = theoreticalT,
rejectNull = abs(.jackknife) > theoretical) %>%
filter(rejectNull == T) %>%
select(obsN, .jackknife, theoretical, rejectNull) %>%
kbl(align = 'c') %>%
kable_styling()
| obsN | .jackknife | theoretical | rejectNull |
|---|---|---|---|
| 27 | -4.347921 | 4.00521 | TRUE |
| 69 | -4.860311 | 4.00521 | TRUE |
| 668 | -13.821477 | 4.00521 | TRUE |
This shows that points 27, 69 and 668 are outliers.
# Index plot of leverages for fitlog, with the 2p/n threshold
fort.3 %>%
ggplot(aes(x = rn, y = .hat)) +
geom_point(size = 1) +
geom_hline(yintercept = (2*p)/n, lty = 2, col = 'red') +
geom_text(aes(label = ifelse(.hat > 2*p/n, rn, '')), hjust = -0.5) +
xlab("Index") +
ylab("Leverages") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Index plot of leverages") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
# Flag leverage / outlier / influence status for each suspicious point.
# Note: 52 appears twice in this vector; harmless for %in% but could be
# de-duplicated.
suspicious <- c(27,50,69,668,150,105,204,17,185,315,248,283,381,367,360,416, 463, 472,52,514,551,52,612,647,682,725,692 ,293,694,627)
fort.3 %>%
filter(obsN %in% suspicious) %>%
mutate(highLeverage = .hat > 2*p/n,
outlier = abs(.jackknife) > theoreticalT,
influential = .cooksd > 0.05) %>%
select(obsN, highLeverage, outlier, influential) %>%
mutate(totalMarks = highLeverage + outlier + influential) %>%
kbl(align = 'c') %>%
kable_styling()
| obsN | highLeverage | outlier | influential | totalMarks |
|---|---|---|---|---|
| 17 | TRUE | FALSE | FALSE | 1 |
| 27 | FALSE | TRUE | FALSE | 1 |
| 50 | TRUE | FALSE | FALSE | 1 |
| 52 | TRUE | FALSE | FALSE | 1 |
| 69 | TRUE | TRUE | TRUE | 3 |
| 105 | TRUE | FALSE | FALSE | 1 |
| 150 | TRUE | FALSE | FALSE | 1 |
| 185 | TRUE | FALSE | FALSE | 1 |
| 204 | TRUE | FALSE | FALSE | 1 |
| 248 | TRUE | FALSE | FALSE | 1 |
| 283 | TRUE | FALSE | FALSE | 1 |
| 293 | FALSE | FALSE | FALSE | 0 |
| 315 | TRUE | FALSE | FALSE | 1 |
| 360 | TRUE | FALSE | FALSE | 1 |
| 367 | TRUE | FALSE | FALSE | 1 |
| 381 | TRUE | FALSE | FALSE | 1 |
| 416 | TRUE | FALSE | FALSE | 1 |
| 463 | TRUE | FALSE | FALSE | 1 |
| 472 | TRUE | FALSE | FALSE | 1 |
| 514 | TRUE | FALSE | FALSE | 1 |
| 551 | TRUE | FALSE | FALSE | 1 |
| 612 | TRUE | FALSE | FALSE | 1 |
| 627 | FALSE | FALSE | FALSE | 0 |
| 647 | TRUE | FALSE | FALSE | 1 |
| 668 | FALSE | TRUE | TRUE | 2 |
| 682 | TRUE | FALSE | FALSE | 1 |
| 692 | TRUE | FALSE | FALSE | 1 |
| 694 | FALSE | FALSE | FALSE | 0 |
| 725 | TRUE | FALSE | FALSE | 1 |
We can see that 27, 668 and 69 are outliers, and that 69 and 668 are also influential.
# Index plot of the raw residuals of fitlog (log scale, hence +/-2 bounds)
fort.3 %>%
dplyr::select(.resid, rn) %>%
gather(residual, value, -rn) %>%
mutate(residual = factor(residual,
levels = '.resid')) %>%
ggplot(aes(x = rn, y = value)) +
geom_point(size = 1) +
geom_hline(yintercept = 0, lty = 2, col = 'red') +
geom_hline(yintercept = 2, lty = 2, col = 'red') +
geom_hline(yintercept = -2, lty = 2, col = 'red') +
geom_text(aes(label = ifelse(abs(value) >2, rn, '')), vjust = -0.5) +
xlab("Index") +
ylab("Residuals") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Index plot of residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
# QQ-plot of the standardised residuals of fitlog
tibble(fort.3$.stdresid) %>%
gather(type, val) %>%
ggplot(aes(sample = val)) +
stat_qq(size = 1) +
geom_abline(slope = 1, intercept = 0, lty = 2, col = 'blue') +
xlab("Theoretical value") +
ylab("Sample value") +
ggtitle("QQ-plot of the residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
It is clear that some points deviate from normality. Overall the residuals do not appear to be normally distributed, and in the residual–index plot the points are not evenly distributed.
# Added-variable plots, labelling the two most extreme points per panel
car::avPlots(fitlog, id.n=2, id.cex=0.7)
We can easily see that points 69 and 668 are outliers in each plot.
We can remove the outliers one by one (not all at the same time) and refit the model to find a better fit.
In any case, I would also like to examine the model with the Box–Cox transformation.
# Fit with the Box-Cox-transformed response.
# NOTE(review): unlike fitlog, this formula omits `casual` -- confirm the
# two models are meant to use different predictor sets.
fit.bc=lm(T_lambda_registered~season+temp+hum+windspeed+workingday+holiday, data = bikesharing)
fort.4 <- fortify(fit.bc)
fort.4$.jackknife <- rstudent(fit.bc)
fort.4$rn <- row.names(fort.4)
# Index plot of raw residuals; points beyond +/-250 are labelled
fort.4 %>%
dplyr::select(.resid, rn) %>%
gather(residual, value, -rn) %>%
mutate(residual = factor(residual,
levels = '.resid')) %>%
ggplot(aes(x = rn, y = value)) +
geom_point(size = 1) +
geom_hline(yintercept = 0, lty = 2, col = 'red') +
geom_hline(yintercept = 250, lty = 2, col = 'red') +
geom_hline(yintercept = -250, lty = 2, col = 'red') +
geom_text(aes(label = ifelse(abs(value) >250, rn, '')), vjust = -0.5) +
xlab("Index") +
ylab("Residuals") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Index plot of residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
# QQ-plot of the standardised residuals of the Box-Cox model
tibble(fort.4$.stdresid) %>%
gather(type, val) %>%
ggplot(aes(sample = val)) +
stat_qq(size = 1) +
geom_abline(slope = 1, intercept = 0, lty = 2, col = 'blue') +
xlab("Theoretical value") +
ylab("Sample value") +
ggtitle("QQ-plot of the residuals") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
For this problem we will use theGusto.txt data. The data consists of 2188 observations and 26 columns. The variable of interest is DAY30 which represents the 30-day survival of patients who suffered myocardial infarction (heart attack). If DAY30 = 0, the patient survived and if DAY30 = 1 the patient died. We are interested in determining which risk factors are associated with the survival.
We are going to limit our analysis to the following variables:AGE: age (years);SEX: 0 = male; 1 = female; DIA: 0 = no diabetes; 1 = diabetes; HTN: 0 =no hypertension; 1 = hypertension; SMK: 1 = current smoker; 2 = formersmoker; 3 = never smoked; PMI: 0 = no previous myocardial infarction; 1 =previous myocardial infarction, HEI: height (cm); WEI: weight (kg).
# Load the GUSTO data, keep the variables of interest and recode SMK
library(readr)
GustoW <- read_table2("/Users/farhadzare/Desktop/Applied linear reg/GustoW.txt")
GustoW<-GustoW[,c(1,2,3,7,11,12,13,14,15)]
GustoW$SMK=factor(GustoW$SMK, labels = c('current','former','never'))
is.factor(GustoW$SMK)
## [1] TRUE
# NOTE(review): HEI is in centimetres, so WEI/(HEI^2) is 10^4 times smaller
# than standard BMI = kg/m^2 (values ~0.0028 instead of ~28). This only
# rescales the BMI coefficient downstream, but confirm the intended units.
GustoW$BMI=GustoW$WEI/(GustoW$HEI^2)
# Drop the raw height/weight columns now that BMI is computed
drop <- c("HEI","WEI")
GustoW_update = GustoW[,!(names(GustoW) %in% drop)]
# Summary statistics of the numeric variables, split by 30-day outcome.
# Column 7 is dropped -- presumably the SMK factor, which cannot be
# summarised numerically (TODO confirm the column position).
GustoW_update[,-7] %>%
gather(variable, value, -DAY30) %>%
group_by(DAY30, variable) %>%
summarize(n = n(),
mean = mean(value),
sd = sd(value),
min = min(value),
q25 = quantile(value, 0.25),
median = median(value),
q75 = quantile(value, 0.75),
max = max(value)) %>%
arrange(variable, DAY30) %>%
kbl(align = 'c', booktabs = T) %>%
kable_styling(full_width = F)
| DAY30 | variable | n | mean | sd | min | q25 | median | q75 | max |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AGE | 2053 | 59.7516254 | 11.7210154 | 23.9100000 | 50.5160000 | 59.8750000 | 69.0470000 | 88.8440000 |
| 1 | AGE | 135 | 71.3814296 | 11.3569271 | 31.4220000 | 65.1250000 | 73.0160000 | 79.3205000 | 89.4840000 |
| 0 | BMI | 2053 | 0.0028006 | 0.0005132 | 0.0014255 | 0.0024580 | 0.0027303 | 0.0030577 | 0.0058125 |
| 1 | BMI | 135 | 0.0026160 | 0.0005332 | 0.0016764 | 0.0022826 | 0.0025489 | 0.0028028 | 0.0049867 |
| 0 | DIA | 2053 | 0.1402825 | 0.3473645 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 |
| 1 | DIA | 135 | 0.1777778 | 0.3837495 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 |
| 0 | HTN | 2053 | 0.4018509 | 0.4903916 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 | 1.0000000 |
| 1 | HTN | 135 | 0.4296296 | 0.4968669 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 | 1.0000000 |
| 0 | PMI | 2053 | 0.1626887 | 0.3691714 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 |
| 1 | PMI | 135 | 0.3037037 | 0.4615689 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 | 1.0000000 |
| 0 | SEX | 2053 | 0.2367267 | 0.4251767 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 |
| 1 | SEX | 135 | 0.4296296 | 0.4968669 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 | 1.0000000 |
# Counts of survivors (0) and deaths (1) within each smoking category
GustoW_update %>%
group_by(DAY30, SMK) %>%
count() %>%
ggplot(aes(x = SMK, y = n, fill = factor(DAY30))) +
geom_bar(stat = 'identity', position = 'dodge') +
scale_fill_brewer(type = 'seq', palette = 'Set1') +
theme(legend.position = 'bottom')
# Logistic regression of 30-day mortality on all remaining predictors
glm1 <- glm(DAY30 ~ ., family = binomial, data = GustoW_update)
glm1S <- summary(glm1)
glm1S
##
## Call:
## glm(formula = DAY30 ~ ., family = binomial, data = GustoW_update)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0961 -0.3865 -0.2450 -0.1497 3.4711
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.90120 1.00686 -7.847 4.25e-15 ***
## SEX 0.47193 0.20304 2.324 0.020106 *
## AGE 0.08948 0.01089 8.215 < 2e-16 ***
## DIA 0.31345 0.25287 1.240 0.215145
## PMI 0.73394 0.20891 3.513 0.000443 ***
## HTN -0.16735 0.19513 -0.858 0.391098
## SMKformer -0.19232 0.25526 -0.753 0.451196
## SMKnever -0.15030 0.24984 -0.602 0.547453
## BMI -336.98688 216.19819 -1.559 0.119069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1013.57 on 2187 degrees of freedom
## Residual deviance: 862.68 on 2179 degrees of freedom
## AIC: 880.68
##
## Number of Fisher Scoring iterations: 6
library(pROC)  # already attached at the top of the file; harmless repeat
# ROC curve of the linear predictor against the observed 30-day outcome
Roc0=roc(GustoW_update$DAY30, predict(glm1))
plot(Roc0,print.auc=TRUE)
AUC=auc(roc(GustoW_update$DAY30, predict(glm1)))
AUC
## Area under the curve: 0.7836
# Brier score: mean squared difference between outcome and fitted probability
BRIER=mean((GustoW_update$DAY30 - predict(glm1, type = 'response'))^2)
BRIER
## [1] 0.05187348
The y-axis shows the true positive rate and the x-axis shows the false positive rate.
# Coefficients whose p-value exceeds 0.2 (candidates for elimination)
glm1S$coefficients[glm1S$coefficients[,4]>.2,]
## Estimate Std. Error z value Pr(>|z|)
## DIA 0.3134467 0.2528728 1.2395428 0.2151445
## HTN -0.1673493 0.1951305 -0.8576272 0.3910983
## SMKformer -0.1923167 0.2552573 -0.7534231 0.4511957
## SMKnever -0.1503009 0.2498433 -0.6015807 0.5474533
# Manual backward elimination: record AIC, AUC and Brier score for the full
# model and after dropping SMK, then HTN+SMK, then HTN+SMK+DIA.
# (Replaces triplicated copy-paste with a single loop over the drop sets;
# the resulting AICData table is identical.)
evalModel <- function(fit) {
  # AIC, AUC and Brier score of a fitted logistic model
  c(AIC(fit),
    as.numeric(auc(roc(GustoW_update$DAY30, predict(fit)))),
    mean((GustoW_update$DAY30 - predict(fit, type = 'response'))^2))
}
dropSets <- list(character(0),          # iteration 1: full model
                 'SMK',
                 c('HTN', 'SMK'),
                 c('HTN', 'SMK', 'DIA'))
AICData <- data.frame(Iteration = seq_along(dropSets), predictors_Eliminate = 0,
                      AIC = 0, AUC = 0, Brier = 0)
for (i in seq_along(dropSets)) {
  dropped <- dropSets[[i]]
  if (length(dropped) == 0) {
    fitTmp <- glm1
    AICData[i, 2] <- 'NA'
  } else {
    # Drop the named terms from the full model, e.g. update(glm1, . ~ . - HTN - SMK)
    fitTmp <- update(glm1, as.formula(paste('. ~ .', paste0('- ', dropped, collapse = ' '))))
    AICData[i, 2] <- paste(dropped, collapse = '+')
  }
  AICData[i, 3:5] <- evalModel(fitTmp)
}
AICData %>%
  kbl(align = 'c') %>%
  kable_styling()
| Iteration | predictors_Eliminate | AIC | AUC | Brier |
|---|---|---|---|---|
| 1 | NA | 880.6832 | 0.7835507 | 0.0518735 |
| 2 | SMK | 877.2835 | 0.7832945 | 0.0518943 |
| 3 | HTN+SMK | 876.0971 | 0.7825693 | 0.0519763 |
| 4 | HTN+SMK+DIA | 875.3988 | 0.7810792 | 0.0519730 |
Checking the manual elimination against R's automated backward selection with `step()`:
library(leaps)
# Automated backward selection by AIC; each step drops the term whose
# removal lowers AIC the most, stopping when no drop helps.
back <- step(glm1)
## Start: AIC=880.68
## DAY30 ~ SEX + AGE + DIA + PMI + HTN + SMK + BMI
##
## Df Deviance AIC
## - SMK 2 863.28 877.28
## - HTN 1 863.42 879.42
## - DIA 1 864.15 880.15
## <none> 862.68 880.68
## - BMI 1 865.23 881.23
## - SEX 1 867.99 883.99
## - PMI 1 874.18 890.18
## - AGE 1 942.70 958.70
##
## Step: AIC=877.28
## DAY30 ~ SEX + AGE + DIA + PMI + HTN + BMI
##
## Df Deviance AIC
## - HTN 1 864.10 876.10
## - DIA 1 864.69 876.69
## <none> 863.28 877.28
## - BMI 1 866.26 878.26
## - SEX 1 869.56 881.56
## - PMI 1 874.83 886.83
## - AGE 1 953.48 965.48
##
## Step: AIC=876.1
## DAY30 ~ SEX + AGE + DIA + PMI + BMI
##
## Df Deviance AIC
## - DIA 1 865.40 875.40
## <none> 864.10 876.10
## - BMI 1 867.80 877.80
## - SEX 1 869.86 879.86
## - PMI 1 875.59 885.59
## - AGE 1 953.59 963.59
##
## Step: AIC=875.4
## DAY30 ~ SEX + AGE + PMI + BMI
##
## Df Deviance AIC
## <none> 865.40 875.40
## - BMI 1 868.34 876.34
## - SEX 1 871.27 879.27
## - PMI 1 877.25 885.25
## - AGE 1 955.93 963.93
# Backward elimination via MASS::stepAIC (same criterion as step() above).
# trace = FALSE suppresses the per-step printout; spell out FALSE rather
# than the reassignable shorthand F.
stepWiseLM <- stepAIC(object = glm1, direction = 'backward', trace = FALSE)
summary(stepWiseLM)
##
## Call:
## glm(formula = DAY30 ~ SEX + AGE + PMI + BMI, family = binomial,
## data = GustoW_update)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1541 -0.3903 -0.2418 -0.1526 3.4729
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.769e+00 9.736e-01 -7.979 1.47e-15 ***
## SEX 4.756e-01 1.941e-01 2.450 0.014281 *
## AGE 8.575e-02 9.945e-03 8.622 < 2e-16 ***
## PMI 7.440e-01 2.084e-01 3.570 0.000357 ***
## BMI -3.448e+02 2.061e+02 -1.673 0.094326 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1013.6 on 2187 degrees of freedom
## Residual deviance: 865.4 on 2183 degrees of freedom
## AIC: 875.4
##
## Number of Fisher Scoring iterations: 6
# ROC curve of the reduced (stepwise-selected) model.
Roc1 <- roc(GustoW_update$DAY30, predict(stepWiseLM))
plot(Roc1, print.auc = TRUE)
# Collect diagnostics in one frame: fortify() supplies .resid/.fitted etc.;
# add jackknife (studentized) residuals and row names for point labelling.
fort.final <- fortify(stepWiseLM)
fort.final$.jackknife <- rstudent(stepWiseLM)
fort.final$rn <- row.names(fort.final)
stepWiseLMS <- summary(stepWiseLM)
# Deviance residuals of the final model, used by all diagnostic plots below.
devRes <- stepWiseLMS$deviance.resid
# Index plot of deviance residuals, colored by outcome; dashed red bands at
# +/-3 flag extreme points, which are also labelled with their row index.
resid_df <- data.frame(y = GustoW_update$DAY30,
                       index = seq_len(nrow(GustoW_update)),
                       deviance = devRes)
resid_df %>%
  ggplot(aes(x = index, y = deviance, col = y)) +
  geom_point() +
  geom_hline(yintercept = c(-3, 3), lty = 2, col = 'red') +
  geom_hline(yintercept = 0, lty = 2) +
  geom_text(aes(label = ifelse(abs(deviance) > 3, index, '')), vjust = -0.5)
# Deviance residuals plotted against each predictor (one facet per variable),
# colored by outcome; the loess smooth highlights any systematic trend that
# would suggest a mis-specified functional form.
# NOTE(review): gather() is superseded by pivot_longer(); it is kept here
# because gather() silently coerces mixed-type predictor columns (SMK appears
# to be a factor, given the SMKformer/SMKnever dummies) -- confirm column
# types before migrating.
GustoW_update %>%
mutate(deviance = devRes,
index = row_number()) %>%
# reshape predictors to long form, keeping deviance/DAY30/index as id columns
gather(variable, value, -deviance, -DAY30, -index) %>%
ggplot(aes(x = value, y = deviance, color = DAY30)) +
geom_point() +
facet_wrap(~variable, scales = 'free') +
# dashed red bands at +/-2 flag large residuals; zero line for reference
geom_hline(yintercept = -2, lty = 2, col = 'red') +
geom_hline(yintercept = 2, lty = 2, col = 'red') +
geom_hline(yintercept = 0, lty = 2) +
# label points with |deviance| > 2 by their row index
geom_text(aes(label = ifelse(abs(deviance) > 2, index, '')), vjust = -0.5) +
stat_smooth(method="loess",se=F)
# Deviance residuals against fitted probabilities. The labelling cutoff is
# 2.5 (not 2) purely to avoid index labels crowding the plot.
fitted_df <- data.frame(index = seq_len(nrow(GustoW_update)),
                        y = GustoW_update$DAY30,
                        predict = fitted(stepWiseLM),  # = predict(type = "response")
                        deviance = devRes)
fitted_df %>%
  ggplot(aes(x = predict, y = deviance, color = y)) +
  geom_point() +
  geom_hline(yintercept = c(-2, 2), lty = 2, col = 'red') +
  geom_hline(yintercept = 0, lty = 2) +
  geom_text(aes(label = ifelse(abs(deviance) > 2.5, index, '')), vjust = -0.5) +
  stat_smooth(method = "loess", se = FALSE)
# Normal Q-Q plot of the deviance residuals.
qqnorm(devRes)
(f) Determine the predicted probability of an individual with the following predictor values:AGE = 67; SEX = 0; DIA = 0; HTN = 0; SMK = 3; PMI = 1; HEI =187; WEI = 115. Compute the 95% confidence interval of the predicted probability for this individual. State the predicted probability and the confidence interval.
# Coefficients of the final (stepwise-selected) model, in the order
# (Intercept), SEX, AGE, PMI, BMI -- x0 below must match this order.
stepWiseLM$coefficients
## (Intercept) SEX AGE PMI BMI
## -7.7687705 0.4756117 0.0857483 0.7439695 -344.7912561
# Design vector for the individual in (f): intercept = 1, SEX = 0, AGE = 67,
# PMI = 1, BMI = WEI / HEI^2 = 115 / 187^2.
# NOTE(review): height appears to be in cm here, matching the scale of the
# model's BMI column (coefficient ~ -345) -- confirm against the data prep.
x0 <- c(1,0,67,1,115/(187^2))
# Variance-covariance matrix of the coefficient estimates, for the Wald CI.
# NOTE(review): this reuses the name S, shadowing the earlier lm() summary.
S <- vcov(stepWiseLM)
S
## (Intercept) SEX AGE PMI
## (Intercept) 9.479684e-01 0.0067995017 -7.997535e-03 -1.666891e-04
## SEX 6.799502e-03 0.0376816926 -3.607839e-04 4.241197e-03
## AGE -7.997535e-03 -0.0003607839 9.891219e-05 -9.632451e-05
## PMI -1.666891e-04 0.0042411975 -9.632451e-05 4.343084e-02
## BMI -1.437937e+02 0.8054436177 4.581698e-01 -2.646149e+00
## BMI
## (Intercept) -143.7936512
## SEX 0.8054436
## AGE 0.4581698
## PMI -2.6461493
## BMI 42473.4701493
# Linear predictor (log-odds) for the individual described in (f).
eta <- t(x0) %*% coef(stepWiseLM)
# Wald 95% half-width on the logit scale: z_{0.975} * sqrt(x0' V x0).
se <- qnorm(0.975) * sqrt(t(x0) %*% S %*% x0)
eta
## [,1]
## [1,] -2.413555
c(eta - se, eta + se)
## [1] -2.84560 -1.98151
# probability scale: inverse logit. plogis() is the numerically stable
# stdlib equivalent of exp(eta) / (1 + exp(eta)).
plogis(eta)
## [,1]
## [1,] 0.08214488
# The 95% confidence interval: transform the logit-scale endpoints
# (the inverse logit is monotone, so endpoints map to endpoints).
plogis(c(eta - se, eta + se))
## [1] 0.05490921 0.12115794